In [479]:
#imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
In [ ]:
#data processing
#csv to pandas
exoDf = pd.read_csv('ExoplanetData.csv')

#remove HD from HD name column, redundant, also remove A
exoDf['hd_name'] = exoDf['hd_name'].str.extract('(\d+)', expand=False)
exoDf['hd_name'] = pd.to_numeric(exoDf['hd_name'], errors='coerce').fillna(exoDf['hd_name'])
exoDf['hd_name'] = exoDf['hd_name'].astype('Int64')
In [ ]:
hdDf = pd.read_csv('HDCatalogue.csv', sep=';')
In [76]:
#currently there are 121 different columns, lets reduce this to some usable ones.
#lets keep time discovered, name, plus anything that can be applied for regression, 
#leaving out flags, discovery methods, etc. 

#lets make a model that predicts based on our exoDf, the number of planets

#only keep necessary columns

#can make 3d plot of all observed exoplanets

columns_to_keep = ['pl_name', 'hostname', 'hd_name', 'hip_name', 'tic_id',
            'gaia_id', 'sy_snum', 'sy_pnum', 'sy_mnum', 'pl_orbper', 
            'pl_orbsmax', 'pl_rade', 'pl_masse', 'pl_dens', 'pl_orbeccen',
            'st_spectype', 'st_teff', 'st_rad', 'st_mass', 'st_met', 'st_lum', 
            'st_logg', 'st_age', 'st_dens', 'ra', 'dec', 'sy_dist', 'sy_plx',
            'sy_bmag', 'sy_vmag', 'sy_kepmag']

exoDf = exoDf[columns_to_keep]
In [482]:
#make a 3d plot of all the stars? test this out

#converts degree in range 0-360 to a radian in range -pi to pi
def deg_from_neg_pi_to_pi(deg):
    return (deg * np.pi / 180) - np.pi

#converts a degree in range -90 to 90 to a radian in range -pi/2 to pi/2
def deg_to_rad_90(deg):
    return (deg * np.pi / 180)

raHd = hdDf['_RAJ2000'].apply(deg_from_neg_pi_to_pi)
raExo = exoDf['ra'].apply(deg_from_neg_pi_to_pi)

decHd = hdDf['_DEJ2000'].apply(deg_to_rad_90)
decExo = exoDf['dec'].apply(deg_to_rad_90)

# Set a higher DPI for better resolution
fig = plt.figure(figsize=(15, 10), dpi=200)
ax = plt.subplot(111, projection='aitoff')

# Initialize the plot with initial data
sc_hd = ax.scatter(raHd, decHd, s=1, c='blue', alpha=0.1)
sc_exo = ax.scatter(raExo, decExo, s=1, c='red', alpha=0.3)
ax.set_xlabel('Right Ascension')
ax.set_ylabel('Declination')
ax.set_title('Systems with Confirmed Exoplanets')
ax.grid(True)

# Update function for animation
def update(frame):
    global sc_hd, sc_exo, sc_zero_line
    # Shift in radians
    shift_radians = np.deg2rad(frame) % (2 * np.pi)
    
    # Update the RA for HD stars
    new_ra_hd = ((raHd + shift_radians + np.pi) % (2 * np.pi)) - np.pi
    # Update the RA for exoplanets
    new_ra_exo = ((raExo + shift_radians + np.pi) % (2 * np.pi)) - np.pi
    
    # Update the scatter plot data
    sc_hd.set_offsets(np.column_stack((new_ra_hd, decHd)))
    sc_exo.set_offsets(np.column_stack((new_ra_exo, decExo)))

    return sc_hd, sc_exo

#uncomment if you want to make the animation!
ani = animation.FuncAnimation(fig, update, frames=np.arange(0, 360, 3), interval = 100, blit=True, repeat=True)
ani.save('stars_animation.mp4', writer='ffmpeg', dpi=200)

plt.show()
No description has been provided for this image
In [ ]:
#And the animation looks like this!

exoplanet animation

In [243]:
#lets make anothe graph of discovered exoplanets, with earth at the center



#cartesian
x = exoDf['sy_dist'] * np.cos(np.deg2rad(exoDf['dec'])) * np.cos(np.deg2rad(exoDf['ra']))
y = exoDf['sy_dist'] * np.cos(np.deg2rad(exoDf['dec'])) * np.sin(np.deg2rad(exoDf['ra']))
z = exoDf['sy_dist'] * np.sin(np.deg2rad(exoDf['dec']))


# Create a 3D scatter plot
fig = plt.figure(figsize=(12, 8), dpi=200)
ax = fig.add_subplot(111, projection='3d')

#plot
scatter = ax.scatter(x, y, z, c='red', marker='o', alpha=0.1)

# Set labels and title
ax.set_title('Cartesian plot of Confirmed Exoplanet Systems')

ax.set_xlim([-1500, 1500])
ax.set_ylim([-8000, 8000])
ax.set_zlim([-5000, 5000])

ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])


def update(frame):
    ax.view_init(elev=10, azim=frame)  # Rotate the view
    return scatter,

#uncomment if you want to make the animation!
#ani = animation.FuncAnimation(fig, update, frames=np.arange(0, 360, 2), interval=100)
#ani.save('circular_3d_plot.gif', writer='pillow', dpi=200)

plt.show()
No description has been provided for this image
In [ ]:
#And the animation looks like this!

cartesian animation

In [364]:
#lets make the model for the regression, visualize what we are working with
#relate spectral type and num of exoplanets

#extract just the spectral type symbol
exoDf['st_spectype'] = exoDf['st_spectype'].str[:2]

#group by spectype and find the mean of planets of that spectype
avg_exoplanets = exoDf.groupby('st_spectype')['sy_pnum'].mean()

#sort highest to lowest
avg_exoplanets_sorted = avg_exoplanets.sort_values(ascending=False)

plt.figure(figsize=(15, 10))
avg_exoplanets_sorted.plot(kind='bar', color='green')
plt.xlabel('Spectral Type')
plt.ylabel('Average Number of Exoplanets')
plt.title('Average Number of Exoplanets by Spectral Type')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [415]:
#relate age with num of exoplanets

#mean num of exoplanets for each age
mean_exoplanets_by_age = exoDf.groupby('st_age')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_age.index, mean_exoplanets_by_age.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_age.index.values, mean_exoplanets_by_age.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_age.index.values, 
        intercept + slope * mean_exoplanets_by_age.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Age (Billions of Years)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Age')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 0.009726610410438824
No description has been provided for this image
In [414]:
#relate stellar mass and number of exoplanets


# Filter the DataFrame to include only observations with stellar mass between 0 and 5 
#solar masses, this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_mass'] >= 0) & (exoDf['st_mass'] <= 5)]

#mean num of exoplanets for each stellar mass
mean_exoplanets_by_mass = filtered_exoDf.groupby('st_mass')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_mass.index, mean_exoplanets_by_mass.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_mass.index.values, mean_exoplanets_by_mass.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_mass.index.values, 
        intercept + slope * mean_exoplanets_by_mass.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Mass (Solar Masses)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Mass')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 2.487538431838445e-17
No description has been provided for this image
In [413]:
#lets relate density to number of exoplanets

# Filter the DataFrame to include only observations with stellar density between 0 and 150 g/cm^3
#this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_dens'] >= 0) & (exoDf['st_dens'] <= 150)]

#mean num of exoplanets for each stellar density
mean_exoplanets_by_density = filtered_exoDf.groupby('st_dens')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_density.index, mean_exoplanets_by_density.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_density.index.values, mean_exoplanets_by_density.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_density.index.values, 
        intercept + slope * mean_exoplanets_by_density.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Density (g/cm^3)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Density')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 0.01653586935233488
No description has been provided for this image
In [416]:
#relate radius to num of exoplanets

#mean num of exoplanets for each stellar radii
mean_exoplanets_by_radius = exoDf.groupby('st_rad')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_radius.index, mean_exoplanets_by_radius.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_radius.index.values, mean_exoplanets_by_radius.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_radius.index.values, 
        intercept + slope * mean_exoplanets_by_radius.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Radius (Solar Radii)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Radius')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 3.1861626015923364e-11
No description has been provided for this image
In [417]:
#relate metallicity to num of exoplanets

# Filter the DataFrame to include only observations with stellar metallicity between 0 and 1 dex
# this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_met'] >= 0) & (exoDf['st_met'] <= 1)]

#mean num of exoplanets for each stellar metallicity
mean_exoplanets_by_met = filtered_exoDf.groupby('st_met')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_met.index, mean_exoplanets_by_met.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_met.index.values, mean_exoplanets_by_met.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_met.index.values, 
        intercept + slope * mean_exoplanets_by_met.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Metallicity (dex)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Metallicity')
plt.grid(True)
plt.tight_layout()
plt.show()

'''Note this p value is above standard 0.05, so the null hypothesis is not rejected
lets avoid using it in our regression''';
p value of: 0.5358958010334814
No description has been provided for this image
In [418]:
#relate effective temperature to number of exoplanets

# Filter the DataFrame to include only observations with stellar effective temperature
# between 0 and 10000 K. this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_teff'] >= 0) & (exoDf['st_teff'] <= 10000)]

#mean num of exoplanets for each stellar effective temperature
mean_exoplanets_by_teff = filtered_exoDf.groupby('st_teff')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_teff.index, mean_exoplanets_by_teff.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_teff.index.values, mean_exoplanets_by_teff.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_teff.index.values, 
        intercept + slope * mean_exoplanets_by_teff.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Effective Temperature (K)')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Effective Temperature')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 2.0631232108630326e-13
No description has been provided for this image
In [419]:
#relate surface gravity to num of exoplanets
# Filter the DataFrame to include only observations with stellar metlalicity between 0 and 6
# log10(cm/s^2) surface gravity. this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_logg'] >= -2) & (exoDf['st_logg'] <= 6)]

#mean num of exoplanets for each log10(cm/s^2) surface gravity
mean_exoplanets_by_logg = filtered_exoDf.groupby('st_logg')['sy_pnum'].mean()

slope, intercept, r_value, p_value, std_error =  stats.linregress(mean_exoplanets_by_logg.index, mean_exoplanets_by_logg.values)
print("p value of: " + str(p_value))

plt.figure(figsize=(15, 10))
plt.scatter(mean_exoplanets_by_logg.index.values, mean_exoplanets_by_logg.values, marker='o', color='black', alpha=0.5)
plt.plot(mean_exoplanets_by_logg.index.values, 
        intercept + slope * mean_exoplanets_by_logg.index.values, color='red', linestyle='-', 
        linewidth=2)
plt.xlabel('Stellar Surface Gravity (log10(cm/s^2))')
plt.ylabel('Number of Exoplanets')
plt.title('Number of Exoplanets by Stellar Surface Gravity')
plt.grid(True)
plt.tight_layout()
plt.show()
p value of: 8.664601243134237e-29
No description has been provided for this image
In [454]:
#lets apply these 6 terms (stellar age, stellar mass, stellar density, 
#  stellar radius, stellar effective temperature, stellar surface gravity) to a 
# regression model, so we can predict the number of exoplanets in a given system

#first, lets do mean imputation on those columns, to get rid of na
columns_to_impute = ['st_age', 'st_mass', 'st_dens', 'st_rad', 'st_teff', 'st_logg']
for column in columns_to_impute:
    exoDf[column] = exoDf[column].fillna(exoDf[column].mean())

#define features and target
X = exoDf[['st_age', 'st_mass', 'st_dens', 'st_rad', 'st_teff', 'st_logg']]
y = exoDf['sy_pnum']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(X_train, y_train)


# Predict the number of exoplanets for the test set
y_pred = model.predict(X_test)

# Calculate and print metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Plot the results
plt.figure(figsize=(15, 10))
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
plt.xlabel('Actual Number of Exoplanets')
plt.ylabel('Predicted Number of Exoplanets')
plt.title('Actual vs Predicted Number of Exoplanets')
plt.grid(True)
plt.show()
Mean Squared Error: 1.4288107134911496
R-squared: 0.016702616477324184
No description has been provided for this image
In [474]:
# Define the cost function (mean squared error)
def compute_cost(X, y, theta):
    m = len(y)
    predictions = X.dot(theta)
    cost = np.sum(np.square(predictions - y)) / (2 * m)
    return cost

# Gradient descent function
def gradient_descent(X, y, theta, alpha, num_iterations):
    m = len(y)
    cost_history = []
    
    for i in range(num_iterations):
        # Calculate the predictions
        predictions = X.dot(theta)
        
        # Calculate the error
        error = predictions - y
        
        # Calculate the gradient
        gradient = X.T.dot(error) / m
        
        # Update parameters
        theta -= alpha * gradient
        
        # Compute the cost
        cost = compute_cost(X, y, theta)
        cost_history.append(cost)
        
        # Print cost every 100 iterations
        #if i % 100 == 0:
            #print(f"Iteration {i}: Cost = {cost}")
    
    return theta, cost_history

# Add a column of ones to X for the intercept term
X_train_with_intercept = np.c_[np.ones((X_train.shape[0], 1)), X_train]

# Initialize parameters (weights)
theta = np.zeros(X_train_with_intercept.shape[1])

# Set hyperparameters
alpha = 0.00000001
num_iterations = 10000

# Run gradient descent
theta, cost_history = gradient_descent(X_train_with_intercept, y_train, theta, alpha, num_iterations)

# Print final parameters
print("Final Parameters:", theta)



# Add a column of ones to X_test for the intercept term
X_test_with_intercept = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Predict using the trained model
y_pred = X_test_with_intercept.dot(theta)

# Calculate Mean Squared Error (MSE)
mse = np.mean((y_test - y_pred) ** 2)

# Plot the actual vs predicted values
'''
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')  # Add a diagonal line
plt.xlabel('Actual Number of Exoplanets')
plt.ylabel('Predicted Number of Exoplanets')
plt.title('Actual vs Predicted Number of Exoplanets (Gradient Descent)')
plt.grid(True)
plt.show() '''

# Print MSE
print("Mean Squared Error:", mse)

# Plot loss curve
plt.plot(range(len(cost_history)), cost_history)
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.title('Loss Curve')
plt.show()
Final Parameters: [ 7.92327467e-06  4.83185211e-05  5.02741468e-07  4.25230969e-05
 -5.30444439e-06  3.29997589e-04  3.97050269e-05]
Mean Squared Error: 1.5937838075520914
No description has been provided for this image